Text analysis: title and abstract of male and female speakers
Data
Data description and summary in script
0_data_summary.
Using data from titles and abstracts.
Formating titles
tit <- data %>% dplyr::select(id,gender,position_cat, audience_n,
title_english)
text_tit <- tit %>% unnest_tokens(output=word,
input=title_english)Total of 320 titles.
Number of titles per group
##
## F M
## 140 180
##
## F M
## others 4 1
## postdoc 25 32
## professor 24 72
## student 86 74
Formating abstracts
data_abs <- data %>% filter(!is.na(abstract_english))
text_tok <- data %>% dplyr::select(id,gender,position_cat, audience_n,
abstract_english) %>%
unnest_tokens(output=word,input=abstract_english)Total of 234 abstracts.
Number of abstracts per group
##
## F M
## 99 135
##
## F M
## others 4 1
## postdoc 21 21
## professor 20 57
## student 53 56
Tidytext
texts <- bind_rows(text_tit, text_tok)
stop_w <- tibble(word = stopwords("en"))
# remove stopwords
text <- texts %>%
anti_join(stop_w, by="word")%>% arrange(word)
# remove other non-words (numbersm characters) and stopwords
text <- text %>% dplyr::slice(-c(1:276)) %>% # number and some symbols
filter(nchar(word)!=1) %>% # letters alone
filter(!word %in% c("mpas", "ÎŽ13c", "ÎČ", "can", "aff", "agb"< "al"))
# solving some simple plurals
plural <- c("actions","advances", "adaptations", "amphibians", "animals",
"ants","anurans",
"applications","approaches", "bees","builds", "birds",
"cerrados","challenges", "biologists", "captures",
"continents","crops",
"decisions","declines","determines","determinants", "defenses",
"dynamics", "dunnocks",
"economics", "ecosystems","environments", "experiences",
"forests", "fruits", "fathers",
"genetics","gifts","gradients","guides","impacts", "islands",
"increases","interactions", "jobs", "lives", "insects",
"landscapes","males","mammals", "mangroves","models","movements",
"mutualisms","networks","neotropics",
"opilions","phenotypes","plants","projects","paths", "perspectives",
"purposes", "populations","promotes","relationships", "relations",
"resources","responses","roads","services","skulls","snakes","seeds",
"soils", "spaces", "spiders","stages", "trees", "variations",
"threats")
text$word[text$word %in% plural] <-
substr(text$word[text$word %in% plural],
1,nchar(text$word[text$word %in% plural])-1)- Grouping similar words:
lemma <- rbind(c("adaptive", "adaptation"),
c("advancement", "advance"),
c("agricultural", "agriculture"),
c("agro", "agriculture" ),
c("amazonia","amazon" ),
c("amazonian","amazon" ),
c("andean","andes"),
c("apply","application"),
c("applying","application"),
c("approaches", "approach"),
c("apidae","apis"),
c("arachnida","arachnid"),
c("argue","argument"),
c("basal", "basis"),
c("behavioral","behavior"),
c("behavioural","behavior"),
c("bignonieae", "bignoniaceae"),
c("biological", "biology"),
c("brazilian","brazil"),
c("building","build"),
c("butterflies", "butterfly"),
c("changing", "change"),
c("cnidarian", "cnidaria"),
c("coastal","coast"),
c("colour", "color"),
c("colors", "color"),
c("communities","community" ),
c("competitive", "competition"),
c("complexity", "complex"),
c("convergences", "convergence"),
c("convergent", "convergence"),
c("croplands","crop"),
c( "cultural", "culture"),
c("darwin's", "darwin"),
c("darwinian", "darwin"),
c("defensive", "defense"),
c("dependent","dependence"),
c("detecting","detection"),
c("determine", "determinant"),
c("developmental", "development"),
c("dispersers","dispersal"),
c("disturbed", "disturbance"),
c("diversification", "diversity"),
c("dragonflies", "dragonfly"),
c("drier", "drought"),
c("ecological", "ecology"),
c("ecologists", "ecology"),
c("endemic", "endemism"),
c("effectiveness", "efficiency"),
c("environmental", "environment"),
c("evolutionary", "evolution"),
c("expanding", "expansion"),
c("extinct", "extinction"),
c("facilitate", "facilitation"),
c("fisheries", "fishery"),
c("floral", "flora"),
c("floristic", "flora"),
c("forested", "forest"),
c("functional", "function"),
c("functionally", "function"),
c("functioning", "function"),
c("geographical", "geographic"),
c("heterogeneties", "heterogeneity"),
c("heterogeneous", "heterogeneity"),
c("histories", "history"),
c("integrated", "integration"),
c("intregating", "integration"),
c("integrative", "integration"),
c("invasive", "invasion"),
c("isotopic", "isotope"),
c("linking", "link"),
c("living", "live"),
c("mammalia", "mammal"),
c("managed", "manage"),
c("managers", "manage"),
c("mathematical", "mathematics"),
c("mates", "mating"),
c("mediated", "mediate"),
c("mechanistic", "mechanism"),
c("matrices", "matrix"),
c("migratory", "migration"),
c("mimicking", "mimicry"),
c("modeling", "model"),
c("mutualistic", "mutualism"),
c("natural", "nature"),
c("neotropical", "neotropic"),
c("northeastern", "northeast"),
c("occuring", "occur"),
c("onça", "onca"),
c("opiliones", "opilion"),
c("parasite", "parasitism"),
c("parent", "parenting"),
c("phylogenies", "phylogeny"),
c("phylogenetic", "phylogeny"),
c("phylogenomic", "phylogeny"),
c("pollinators", "pollination"),
c("protected", "protect"),
c("protective", "protect"),
c("rainfall", "rain"),
c("reconstructing", "reconstruction"),
c("regulatory", "regulation"),
c("regulates", "regulation"),
c("relation", "relationship"),
c("reproductive", "reproduction"),
c("restored", "restoration"),
c("robustness", "robust"),
c("scientific", "science"),
c("scientist", "science"),
c("sexy", "sexual"),
c("simulated", "simulation"),
c("societies", "society"),
c("social", "society"),
c("socio", "society"),
c("space", "spatial"),
c("spacio", "spatial"),
c("stabilize", "stability"),
c("stable", "stability"),
c("stories", "story"),
c("strategic", "strategy"),
c("strategies", "strategy"),
c("structured", "structure"),
c("structuring", "structure"),
c("studies", "study"),
c("studing", "study"),
c("sustainable", "sustainability"),
c("theories", "theory"),
c("theoretical", "theory"),
c("threatened", "threat"),
c("tropical", "tropic"),
c("vision", "visual")
)
lemma <- as.data.frame(lemma)
for (i in 1:dim(lemma)[1]){
text$word[text$word == lemma[i,1]] <- lemma[i,2]
}WORDS - all data
##
## F M
## 10932 13450
##
## F M
## others 260 137
## postdoc 2813 2562
## professor 1978 5082
## student 5790 5658
20 more common workds
| word | n |
|---|---|
| species | 388 |
| ecology | 196 |
| forest | 187 |
| study | 161 |
| model | 159 |
| evolution | 140 |
| environment | 138 |
| landscape | 126 |
| population | 126 |
| diversity | 117 |
| plant | 103 |
| community | 101 |
| male | 101 |
| different | 97 |
| nature | 94 |
| patterns | 89 |
| areas | 86 |
| interaction | 84 |
| present | 84 |
| animal | 82 |
| use | 82 |
Word cloud
par(mfrow=c(1,2))
textplot_wordcloud(x=dfm(tokens(text$word[text$gender=="F"])),
col="#6D57CF")
par(new=T)
textplot_wordcloud(x=dfm(tokens(text$word[text$gender=="M"])),
col="#FCA532")Word frequencies by gender
props <- text %>%
count(gender, word) %>%
group_by(gender) %>%
mutate(proportion = n / sum(n)) %>%
pivot_wider(names_from = gender, values_from = c(proportion,n)) %>%
mutate(abs.dif.p = abs(proportion_F-proportion_M),
rel.dif.p = pmax(proportion_F, proportion_M)/
pmin(proportion_F, proportion_M)) %>%
arrange(desc(abs.dif.p))
props$label <- NA
props$label[1:20] <- props$word[1:20]ggplot(props, aes(x=proportion_M,, y=proportion_F,
color=abs.dif.p)) +
geom_abline(color = "gray40", lty = 2) +
#geom_point(size=2.5, alpha=0.5)+
geom_jitter(size=2.5, alpha=0.2)+
geom_text_repel(aes(label=label), size=3.2)+
scale_x_log10(name="Male most used words",
labels = percent_format()) +
scale_y_log10(name="Female most used words",
labels = percent_format()) +
scale_color_gradient(name="Abs Diff",low = "blue", high = "red",
labels=percent_format()) +
theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))Words that are close to the dashed line have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.
Correlation of word frequeency use between gender:
##
## Pearson's product-moment correlation
##
## data: props$proportion_F and props$proportion_M
## t = 70.485, df = 1651, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8538041 0.8778980
## sample estimates:
## cor
## 0.8663542
Highly correlated -> it means they tend to use the same frequency of main word
20 words with the largest differences in frequency
prop2 <- props %>% filter(!is.na(label)) %>%
arrange(desc(proportion_F), desc(proportion_M)) %>%
mutate(ntot = n_F + n_M) %>%
mutate(word = fct_reorder(word,(ntot),max),
proportion_F = proportion_F*-1) %>%
pivot_longer(2:3,names_to = "gender", values_to ="proportion")
ggplot(prop2, aes(x=proportion, y=word,fill=gender)) +
geom_col()+ ylab("") + xlab("Proportion")+
scale_fill_manual(name="gender", values=c("#6D57CF","#FCA532"),
labels=c("F", "M"))+
geom_vline(xintercept = c(-0.02,-0.01,0,0.01,0.02),
linetype="dotted",
col="darkgray") +
scale_x_continuous(breaks=c(-0.02,-0.01,0,0.01,0.02),
labels = c(0.02,0.01,0,0.01,0.02))TF IDF
The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites.
Calculating tf-idf attempts to find the words that are important (i.e., common) in a text, but not too common. Letâs do that now.
10 âexclusiveâ words for each group
text_id$word <- as.factor(text_id$word)
text_id %>%
group_by(gender) %>%
arrange(desc(tf_idf)) %>%
top_n(10, tf_idf) %>%
ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~gender, scales = "free") +
theme_minimal()WORDS - professors only data
##
## F M
## 1978 5082
par(mfrow=c(1,2))
textplot_wordcloud(x=dfm(tokens(textP$word[textP$gender=="F"])),
col="#6D57CF")
par(new=T)
textplot_wordcloud(x=dfm(tokens(textP$word[textP$gender=="M"])),
col="#FCA532")Mean number of words by abstract
textP %>% count(id,gender) %>%
ggplot(aes(x=gender, y=n)) +
geom_violin() + geom_boxplot(width=0.2)+
ggbeeswarm::geom_quasirandom(size=3, shape=21) 20 most commmon words
| word | n |
|---|---|
| species | 90 |
| ecology | 71 |
| evolution | 54 |
| population | 50 |
| environment | 47 |
| study | 45 |
| plant | 43 |
| model | 39 |
| diversity | 37 |
| nature | 36 |
| ecosystem | 33 |
| pollination | 32 |
| research | 31 |
| society | 30 |
| biology | 29 |
| science | 29 |
| interaction | 27 |
| amphibian | 25 |
| may | 25 |
| community | 24 |
Words Frequency by gender
propsP <- textP %>%
count(gender, word) %>%
group_by(gender) %>%
mutate(proportion = n / sum(n)) %>%
pivot_wider(names_from = gender, values_from = c(proportion,n)) %>%
mutate(abs.dif.p = abs(proportion_F-proportion_M),
rel.dif.p = pmax(proportion_F, proportion_M)/
pmin(proportion_F, proportion_M)) %>%
arrange(desc(abs.dif.p))
propsP$label <- NA
propsP$label[1:20] <- propsP$word[1:20]ggplot(propsP, aes(x=proportion_M, y=proportion_F,
color=abs.dif.p)) +
geom_abline(color = "gray40", lty = 2) +
# geom_point(size=2.5, alpha=0.3) +
geom_jitter(size=2.5, alpha=0.3)+
geom_text_repel(aes(label=label), size=3)+
scale_x_log10(name="Male most used words", limits=c(0.0003,0.02),
labels = percent_format()) +
scale_y_log10(name="Female Most used words", limits=c(0.0003,0.02),
labels = percent_format()) +
scale_color_gradient(name="Abs Diff",low = "blue", high = "red",
labels=percent_format()) +
theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females.
Labels for the 20 words with largest differences in frequency.
Correlation of word frequeency use between gender:
##
## Pearson's product-moment correlation
##
## data: propsP$proportion_F and propsP$proportion_M
## t = 20.205, df = 518, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6128988 0.7093872
## sample estimates:
## cor
## 0.6638971
20 words with the largest differences in frequency
propP2 <- propsP %>% filter(!is.na(label)) %>%
arrange(desc(proportion_F), desc(proportion_M)) %>%
mutate(ntot = n_F + n_M) %>%
mutate(word = fct_reorder(word,(ntot),max),
proportion_F = proportion_F*-1) %>%
pivot_longer(2:3,names_to = "gender", values_to ="proportion")
ggplot(propP2, aes(x=proportion, y=word,fill=gender)) +
geom_col()+ ylab("") + xlab("Proportion")+
scale_fill_manual(name="gender", values=c("#6D57CF","#FCA532"),
labels=c("F", "M"))+
geom_vline(xintercept = c(-0.02,-0.01,0,0.01,0.02),
linetype="dotted",
col="darkgray") +
scale_x_continuous(breaks=c(-0.02,-0.01,0,0.01,0.02),
labels = c(0.02,0.01,0,0.01,0.02))TF IDF
text_idP <- textP %>% count(gender, word) %>%
bind_tf_idf(word, gender, n) %>%
arrange(desc(tf_idf))10 âexclusiveâ words for each group
text_idP$word <- as.factor(text_idP$word)
text_idP %>%
group_by(gender) %>%
arrange(desc(tf_idf)) %>%
top_n(10, tf_idf) %>%
ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~gender, scales = "free") +
theme_minimal()Topic model - all data
LDA - latent Dirichlet allocation method for fiting topic model
It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to âoverlapâ each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural lanâ guage.
Every document is a mixture of topics
Every topic is a mixture of words
matext <- text %>% count(id, gender, word) %>% mutate(id = paste(id, gender, sep="_")) %>%
select(-gender) %>%
cast_dtm(term=word,document=id,value=n)Choosing number of topics: comparing AIC
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
ap_lda5 <- LDA(matext, k = 5, control = list(seed = 1234))
ap_lda10<- LDA(matext, k = 10, control = list(seed = 1234))
ap_lda20 <- LDA(matext, k = 20, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4,ap_lda5, ap_lda10,ap_lda20,
base=T)## AIC dAIC df
## ap_lda2 376887.9 0.0 9893
## ap_lda3 378653.0 1765.0 14839
## ap_lda4 382386.9 5498.9 19785
## ap_lda5 387562.8 10674.9 24731
## ap_lda10 420417.2 43529.2 49461
## ap_lda20 498853.0 121965.1 98921
two-topics model seems the most plausible model
Word-topic probabilities
10 words with the largest probabilities for each group
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + coord_flip()
words with the greates difference in Beta between topics
beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread## # A tibble: 264 Ă 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 ability 0.00114 0.000103 -3.47
## 2 abundance 0.00175 0.000283 -2.63
## 3 activities 0.000290 0.00134 2.21
## 4 adaptation 0.00174 0.00109 -0.677
## 5 addition 0.00103 0.00102 -0.0213
## 6 affect 0.000867 0.00103 0.249
## 7 agriculture 0.000690 0.00176 1.35
## 8 also 0.00284 0.00273 -0.0557
## 9 although 0.000595 0.00108 0.856
## 10 amazon 0.000991 0.00115 0.217
## # âč 254 more rows
beta_spread %>%
arrange(log_ratio) %>% slice(c(1:5,260:264)) %>%
ggplot(aes(fct_reorder(term,log_ratio,min), log_ratio)) +
geom_col(show.legend = FALSE) + coord_flip() +
ylab("Log2 ration of beta in topic 2/topic 1") + xlab("Word")Document-topic probabilities - classifying the abstracts
and comparing the two groups by gender (if there is a difference in frequency)
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>% mutate(gender = substr(document, nchar(document), nchar(document))) %>%
group_by(document,gender) %>%
top_n(1, gamma)
table(classifi$gender, classifi$topic)##
## 1 2
## F 71 62
## M 101 76
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>% kable()| gender | 1 | 2 |
|---|---|---|
| F | 53% (71) | 47% (62) |
| M | 57% (101) | 43% (76) |
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ gender)Topic model - Professors only
matextP <- textP %>%
count(id, gender, word) %>% mutate(id = paste(id, gender, sep="_")) %>%
select(-gender) %>%
cast_dtm(term=word,document=id,value=n)ap_lda2P <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3P <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4P <- LDA(matextP, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2P, ap_lda3P, ap_lda4P,base=T)## AIC dAIC df
## ap_lda2P 107203.2 0.0 4955
## ap_lda3P 109181.9 1978.6 7432
## ap_lda4P 111348.9 4145.7 9909
word-topic probabilities
ap_topicsP <- tidy(ap_lda2P, matrix = "beta")
ap_top_termsP <- ap_topicsP %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_termsP %>%
mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + coord_flip()
words with the greates difference in Beta between topics
beta_spread <- ap_topicsP %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread## # A tibble: 373 Ă 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 ability 1.38e- 3 3.09e- 4 -2.15
## 2 abundant 1.10e- 3 3.05e- 4 -1.85
## 3 academic 2.80e- 4 1.15e- 3 2.04
## 4 accepted 1.40e- 3 6.90e-59 -184.
## 5 across 1.68e- 3 5.74e- 4 -1.55
## 6 action 1.94e- 3 2.19e- 5 -6.47
## 7 activities 3.07e- 4 2.27e- 3 2.88
## 8 adaptation 8.59e- 4 4.57e- 3 2.41
## 9 address 5.42e- 4 1.17e- 3 1.10
## 10 advance 5.92e-10 2.01e- 3 21.7
## # âč 363 more rows
beta_spread %>%
arrange(log_ratio) %>% slice(c(1:5,260:264)) %>%
ggplot(aes(fct_reorder(term,log_ratio,min), log_ratio)) +
geom_col(show.legend = FALSE) + coord_flip() +
ylab("Log2 ration of beta in topic 2/topic 1") + xlab("Word")Document-topic probabilities
ap_documentsP <- tidy(ap_lda2P, matrix = "gamma")
classifiP <- ap_documentsP %>% mutate(gender = substr(document, nchar(document), nchar(document))) %>%
group_by(document,gender) %>%
top_n(1, gamma)
table(classifiP$gender, classifiP$topic)##
## 1 2
## F 10 14
## M 38 34
library(janitor)
classifiP %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>% kable()| gender | 1 | 2 |
|---|---|---|
| F | 42% (10) | 58% (14) |
| M | 53% (38) | 47% (34) |
classifiP %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
geom_violin()+
facet_wrap(~ gender)Chi-square test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: classifiP$gender and classifiP$topic
## X-squared = 0.5, df = 1, p-value = 0.4795